Dataset : US Police Shootings
Data Source : https://www.kaggle.com/ahsen1330/us-police-shootings
Libraries used:
# importing all the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.io as pio
import matplotlib
from scipy import stats
from statsmodels.stats.proportion import proportions_ztest
# Global plot styling: fivethirtyeight for matplotlib, a dark template for
# plotly, and a larger default font size for matplotlib text.
plt.style.use('fivethirtyeight')
pio.templates.default = "plotly_dark"
plt.rcParams['font.size'] = 18
# Load the shootings CSV into a data frame
us_police_shootings = pd.read_csv("shootings.csv")
# Preview the first five rows to see what the dataset looks like
us_police_shootings.head(n=5)
| id | name | date | manner_of_death | armed | age | gender | race | city | state | signs_of_mental_illness | threat_level | flee | body_camera | arms_category | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3 | Tim Elliot | 2015-01-02 | shot | gun | 53.0 | M | Asian | Shelton | WA | True | attack | Not fleeing | False | Guns |
| 1 | 4 | Lewis Lee Lembke | 2015-01-02 | shot | gun | 47.0 | M | White | Aloha | OR | False | attack | Not fleeing | False | Guns |
| 2 | 5 | John Paul Quintero | 2015-01-03 | shot and Tasered | unarmed | 23.0 | M | Hispanic | Wichita | KS | False | other | Not fleeing | False | Unarmed |
| 3 | 8 | Matthew Hoffman | 2015-01-04 | shot | toy weapon | 32.0 | M | White | San Francisco | CA | True | attack | Not fleeing | False | Other unusual objects |
| 4 | 9 | Michael Rodriguez | 2015-01-04 | shot | nail gun | 39.0 | M | Hispanic | Evans | CO | False | attack | Not fleeing | False | Piercing objects |
# Dimensions of the data frame as a (rows, columns) tuple
us_police_shootings.shape
(4895, 15)
# Number of missing values per column (isna is the canonical name of isnull)
us_police_shootings.isna().sum()
id 0 name 0 date 0 manner_of_death 0 armed 0 age 0 gender 0 race 0 city 0 state 0 signs_of_mental_illness 0 threat_level 0 flee 0 body_camera 0 arms_category 0 dtype: int64
# Names of all columns in the data set
us_police_shootings.columns
Index(['id', 'name', 'date', 'manner_of_death', 'armed', 'age', 'gender',
'race', 'city', 'state', 'signs_of_mental_illness', 'threat_level',
'flee', 'body_camera', 'arms_category'],
dtype='object')
# Distinct values of the 'armed' column, in order of first appearance
us_police_shootings['armed'].unique()
array(['gun', 'unarmed', 'toy weapon', 'nail gun', 'knife', 'unknown',
'shovel', 'hammer', 'hatchet', 'sword', 'machete', 'box cutter',
'metal object', 'screwdriver', 'lawn mower blade', 'flagpole',
'guns and explosives', 'cordless drill', 'metal pole', 'Taser',
'metal pipe', 'metal hand tool', 'blunt object', 'metal stick',
'sharp object', 'meat cleaver', 'carjack', 'chain',
"contractor's level", 'stapler', 'crossbow', 'bean-bag gun',
'baseball bat and fireplace poker', 'straight edge razor',
'gun and knife', 'ax', 'brick', 'baseball bat', 'hand torch',
'chain saw', 'garden tool', 'scissors', 'pole', 'pick-axe',
'flashlight', 'vehicle', 'spear', 'chair', 'pitchfork',
'hatchet and gun', 'rock', 'piece of wood', 'bayonet', 'pipe',
'glass shard', 'motorcycle', 'pepper spray', 'metal rake', 'baton',
'crowbar', 'oar', 'machete and gun', 'air conditioner',
'pole and knife', 'beer bottle', 'baseball bat and bottle',
'fireworks', 'pen', 'chainsaw', 'gun and sword', 'gun and car',
'pellet gun', 'BB gun', 'incendiary device', 'samurai sword',
'bow and arrow', 'gun and vehicle', 'vehicle and gun', 'wrench',
'walking stick', 'barstool', 'grenade', 'BB gun and vehicle',
'wasp spray', 'air pistol', 'baseball bat and knife',
'vehicle and machete', 'ice pick', 'car, knife and mace'],
dtype=object)
# Distinct values of the 'state' column, in order of first appearance
us_police_shootings['state'].unique()
array(['WA', 'OR', 'KS', 'CA', 'CO', 'OK', 'AZ', 'IA', 'PA', 'TX', 'OH',
'LA', 'MT', 'UT', 'AR', 'IL', 'NV', 'NM', 'MN', 'MO', 'VA', 'NJ',
'IN', 'KY', 'MA', 'NH', 'FL', 'ID', 'MD', 'NE', 'MI', 'GA', 'TN',
'NC', 'AK', 'NY', 'ME', 'AL', 'MS', 'WI', 'SC', 'DE', 'DC', 'WV',
'HI', 'WY', 'ND', 'CT', 'SD', 'VT', 'RI'], dtype=object)
# Distinct values of the 'arms_category' column, in order of first appearance
us_police_shootings['arms_category'].unique()
array(['Guns', 'Unarmed', 'Other unusual objects', 'Piercing objects',
'Sharp objects', 'Unknown', 'Blunt instruments', 'Multiple',
'Electrical devices', 'Hand tools', 'Vehicles', 'Explosives'],
dtype=object)
# Helper column of ones so that summing it per group yields a row count.
us_police_shootings['count'] = 1

# Total shootings per value of 'armed', as a two-column frame.
runs = (
    us_police_shootings
    .groupby(['armed'])['count']
    .sum()
    .reset_index()
    .rename(columns={'count': 'total_shootings'})
)
# Same totals as a Series indexed by weapon type.
max_shootings = runs.groupby(['armed'])['total_shootings'].sum()

# Bar chart of the ten most frequent weapon types
plt.figure(figsize=(15, 10))
top_weapons = max_shootings.sort_values(ascending=False).head(10)
top_weapons.plot.bar()
plt.title('Top 10 weapons')
plt.xlabel('Type of weapon')
plt.ylabel('Total shootings')
Text(0.5, 1.0, 'Top 10 weapons')
# BAR GRAPH OF THE RACE OF THE PEOPLE SHOT
# (one bar per value of the 'race' column, height = number of rows)
plt.figure(figsize=(15, 10))
sns.countplot(data=us_police_shootings, x="race")
plt.show()
# MANNER OF DEATH GRAPH
# Build the frequency table with explicit column names. Renaming the
# 'index' / 'manner_of_death' columns after reset_index() only works on
# pandas < 2.0; rename_axis + reset_index(name=...) gives the same two
# columns on every pandas version.
manner_of_death = (
    us_police_shootings['manner_of_death']
    .value_counts()
    .rename_axis('manner of death')
    .reset_index(name='count')
)

plt.figure(figsize=(8, 10))
ax = sns.barplot(x='manner of death',
                 y='count',
                 data=manner_of_death,
                 palette="Set2")
plt.title("Frequency vs Manner of Death")
plt.xlabel("Manner of Death")
plt.ylabel("Frequency")
plt.show()
# GENDER PIE CHART
# Share of rows per gender, labelled with two-decimal percentages.
gender_sizes = us_police_shootings.groupby('gender').size()
gender_sizes.plot.pie(autopct='%.2f',
                      ylabel='Gender',
                      figsize=(10, 10))
<AxesSubplot:ylabel='Gender'>
# PIE CHART OF SHOOTINGS PER STATE
# value_counts() gives one entry per state: index = state code, value = rows.
state_shooting = us_police_shootings["state"].value_counts()
fig = px.pie(
    state_shooting,
    names=state_shooting.index,
    values=state_shooting.values,
    title="Police shootings in US States",
    height=900,
)
fig.show()
# TOP 20 CITIES BY NUMBER OF SHOOTINGS
# value_counts() is already sorted in descending order, so head(20) keeps the
# 20 most frequent cities. rename_axis + reset_index(name=...) names both
# columns explicitly; renaming 'index' / 'city' afterwards only works on
# pandas < 2.0.
city = (
    us_police_shootings['city']
    .value_counts()
    .rename_axis('City')
    .reset_index(name='Total Shootings')
    .head(20)
)

fig = px.bar(city,
             x='City',
             y='Total Shootings',
             text='Total Shootings',
             title='Top 20 Cities with most shootings',
             height=800,
             width=900)
# One colour per bar (20 bars, 20 colours).
fig.update_traces(marker_color=['pink', 'plum', 'mediumorchid', 'darkviolet', 'mediumpurple',
                                'purple', 'indigo', 'thistle', 'mediumvioletred', 'blueviolet',
                                'yellow', 'green', 'red', 'grey', 'gold', 'blue', 'orange',
                                'yellowgreen', 'indianred', 'cyan'])
fig.show()
# GENDER VS AGE BOXPLOT, with the median age written on each box
# groupby() returns the medians sorted by gender label.
medians = us_police_shootings.groupby('gender')['age'].median()

plt.figure(figsize=(15, 5))
# Pass the same label order to seaborn so that box i is the group whose
# median is medians.iloc[i]; seaborn would otherwise order the boxes by first
# appearance in the data and the annotations could land on the wrong box.
box = sns.boxplot(x=us_police_shootings['gender'],
                  y=us_police_shootings['age'],
                  order=list(medians.index))
# Iterate over values by position rather than medians[i], which is a label
# lookup on a string index.
for position, median_age in enumerate(medians):
    box.annotate(str(median_age),
                 xy=(position, median_age),
                 horizontalalignment='center')
plt.title('Gender VS Age Distribution', fontsize=20)
Text(0.5, 1.0, 'Gender VS Age Distribution')
# ARMS CATEGORY USED DURING SHOOTING
# rename_axis + reset_index(name=...) names both columns explicitly; renaming
# 'index' / 'arms_category' after reset_index() only works on pandas < 2.0.
arms_category = (
    us_police_shootings['arms_category']
    .value_counts()
    .rename_axis('Arms Category')
    .reset_index(name='Total')
)

fig = px.bar(arms_category,
             x='Arms Category',
             y='Total',
             text='Total',
             title='Arms Category Used',
             height=800,
             width=900)
# The column has 12 distinct categories, so supply 12 colours (the original
# list had only 10, leaving the last two bars without an assigned colour).
fig.update_traces(marker_color=['pink', 'lightgreen', 'gold', 'darkviolet', 'red', 'purple',
                                'orange', 'thistle', 'indianred', 'blueviolet',
                                'cyan', 'grey'])
fig.show()
# Rows with and without recorded signs of mental illness.
count = us_police_shootings['signs_of_mental_illness'].value_counts()
# Wedge sizes in the same order as the labels below: False first, then True.
pie_list = [count[flag] for flag in (False, True)]
mental_labels = ["Mentally Stable", "Not Mentally Stable"]
def absolute_value(val):
    """Format a pie-wedge percentage for display.

    Despite the name, this does not compute an absolute value: it renders
    *val* with two decimal places followed by a percent sign.

    Args:
        val: The number to format (used as the ``autopct`` callback of
            ``plt.pie``).

    Returns:
        The formatted string, e.g. ``'37.50%'``.
    """
    return f'{val:.2f}%'
# Pie chart of shootings with vs. without signs of mental illness; each wedge
# is labelled through the absolute_value formatter.
plt.figure(figsize=(10, 10))
plt.pie(
    pie_list,
    autopct=absolute_value,
    labels=mental_labels,
)
plt.title("Mental Illness")
plt.legend()
<matplotlib.legend.Legend at 0x7f4245c18700>
# Bare column access kept from the original; its value is not used below.
us_police_shootings['threat_level']

# Rows per threat level, and the total over the three levels.
count_threats = us_police_shootings['threat_level'].value_counts()
num_threats = sum(
    count_threats[level] for level in ('attack', 'other', 'undetermined')
)

# One-proportion z-test of H0: the share of 'attack' rows equals 0.64.
# Returns (z statistic, p-value).
proportions_ztest(count=count_threats['attack'], nobs=num_threats, value=.64)
(0.812740629941684, 0.4163667811506514)
Conclusion: since the p-value (0.416366) is greater than 0.05, we fail to reject the null hypothesis; the data are consistent with attack threats making up 64% of all threats. For illustration, the counts per threat level are plotted in the bar plot below:
# BAR CHART OF THREAT LEVELS
# Derive labels and heights from the same list so each bar is guaranteed to
# carry its own label. Taking the labels from unique() instead would order
# them by first appearance in the data, which need not match the order of the
# heights.
threat_labels = ['attack', 'other', 'undetermined']
threat_values = [count_threats[label] for label in threat_labels]

plt.figure(figsize=(8, 10))
plt.bar(threat_labels, threat_values, color='green', width=0.5)
plt.title("Threat Types")
plt.xlabel("Threats")
plt.ylabel("Number of Threats")
Text(0, 0.5, 'Number of Threats')
# GENDER VS FLEEING
# Work on an explicit copy so the assignment below modifies this frame only
# and does not raise SettingWithCopyWarning.
gender_flee = us_police_shootings[['gender', 'flee']].copy()
# Collapse every value other than 'Not fleeing' into a single 'fleeing' value.
gender_flee.loc[gender_flee['flee'] != 'Not fleeing', 'flee'] = 'fleeing'

# Two-way table: rows = gender, columns = flee status
cross_table = pd.crosstab(gender_flee['gender'],
                          gender_flee['flee'],
                          margins=False)
# Fix the column order so it matches the legend labels given below.
cross_table = cross_table[['Not fleeing', 'fleeing']]

# Plotting the graph
cross_table.plot(kind='bar',
                 rot=0,
                 color={'fleeing': "red", 'Not fleeing': "green"},
                 figsize=(10, 7))
plt.legend(['Not fleeing', 'Fleeing'], labelcolor='black')
plt.xlabel('Gender')
plt.ylabel('Numbers')
plt.title('Male Female Flee Chart', color='black')
/home/aman/Documents/my_env/lib/python3.8/site-packages/pandas/core/indexing.py:1637: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy /home/aman/Documents/my_env/lib/python3.8/site-packages/pandas/core/indexing.py:692: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
Text(0.5, 1.0, 'Male Female Flee Chart')
# RACE BREAKDOWN OF SHOOTINGS WITHOUT A BODY CAMERA
# The column is converted to strings in place, so the filter compares against
# the text "False".
us_police_shootings["body_camera"] = us_police_shootings["body_camera"].map(str)
cam = us_police_shootings[["race", "body_camera"]]
no_camera_mask = cam["body_camera"] == "False"
cam_off = cam.loc[no_camera_mask, "race"].value_counts()

fig = px.pie(
    cam_off,
    names=cam_off.index,
    values=cam_off.values,
    title="People belonging to Race who did not have Body Camera",
)
fig.show()
# RACE VS AGE Boxplot, with the median age written on each box
# groupby() returns the medians sorted by race label.
medians = us_police_shootings.groupby('race')['age'].median()

plt.figure(figsize=(15, 10))
# Pass the same label order to seaborn so that box i is the group whose
# median is medians.iloc[i]; seaborn would otherwise order the boxes by first
# appearance in the data and the annotations could land on the wrong box.
box = sns.boxplot(x=us_police_shootings['race'],
                  y=us_police_shootings['age'],
                  order=list(medians.index))
# Iterate over values by position rather than medians[i], which is a label
# lookup on a string index.
for position, median_age in enumerate(medians):
    box.annotate(str(median_age),
                 xy=(position, median_age),
                 horizontalalignment='center')
plt.title('Race VS Age Distribution', fontsize=22)
Text(0.5, 1.0, 'Race VS Age Distribution')
# SHOOTINGS PER STATE WHERE THE ARMS CATEGORY IS NOT 'Guns'
# NOTE: this keeps every other category, including 'Unarmed' and 'Unknown'.
ms_cat = us_police_shootings[us_police_shootings["arms_category"] != "Guns"]
no_guns_plot = ms_cat["state"].value_counts()

plt.figure(figsize=(15, 25))
sns.barplot(x=no_guns_plot, y=no_guns_plot.index)
plt.xlabel("Count")
plt.ylabel("States")
plt.title("Frequency of Objects other than guns in different States")
# Render the figure; plt.plot() with no arguments draws nothing.
plt.show()
[]
# SHOOTINGS PER STATE WHERE THE ARMS CATEGORY IS 'Guns'
# After count(), the 'arms_category' column holds the number of rows per state.
gun_rows = us_police_shootings[us_police_shootings['arms_category'] == 'Guns']
victim_with_gun = (
    gun_rows[['state', 'arms_category']]
    .groupby('state')
    .count()
    .reset_index()
)

fig = px.bar(
    victim_with_gun,
    x='arms_category',
    y='state',
    text="arms_category",
    color='state',
    width=970,
    height=1500,
)
fig.show()
# ARMED STATUS VS SIGNS OF MENTAL ILLNESS
# Work on an explicit copy so the assignment below modifies this frame only
# and does not raise SettingWithCopyWarning.
mental_health_weapon_use = us_police_shootings[['armed', 'signs_of_mental_illness']].copy()
# Collapse every value other than 'unarmed' into a single 'armed' value.
# NOTE: this also folds 'unknown' into 'armed'.
mental_health_weapon_use.loc[mental_health_weapon_use['armed'] != 'unarmed', 'armed'] = 'armed'

# Two-way table: rows = signs of mental illness, columns = armed status
cross_table = pd.crosstab(mental_health_weapon_use['signs_of_mental_illness'],
                          mental_health_weapon_use['armed'],
                          margins=False)
# Fix the column order so it matches the legend labels given below.
cross_table = cross_table[['armed', 'unarmed']]

# Plotting the graph
cross_table.plot(kind='bar',
                 rot=0,
                 color={'armed': "red", 'unarmed': "green"},
                 figsize=(15, 10))
plt.legend(['Armed', 'Unarmed'], labelcolor='black')
plt.xlabel('Sign of mental illness')
plt.ylabel('Number of shootings')
plt.title('Relationship between armed and sign of mental illness', color='black')
/home/aman/Documents/my_env/lib/python3.8/site-packages/pandas/core/indexing.py:1720: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
Text(0.5, 1.0, 'Relationship between armed and sign of mental illness')
# SHOOTINGS PER YEAR
us_police_shootings['year'] = pd.DatetimeIndex(us_police_shootings['date']).year
# Count rows per year, sorted by year. groupby().size() names the columns
# explicitly; the earlier value_counts().reset_index().rename(...) chain
# loses the 'year' column on pandas >= 2.0 and the following groupby fails.
shootings_each_year = (
    us_police_shootings
    .groupby('year')
    .size()
    .reset_index(name='no of shootings')
)

# Plotting the graph
plt.figure(figsize=(10, 10))
plt.plot(shootings_each_year['year'],
         shootings_each_year['no of shootings'],
         color='green',
         marker='o',
         linestyle='solid')
[<matplotlib.lines.Line2D at 0x7f423fcac5b0>]
# Fresh copy of the data with a helper column of ones plus the year and
# month extracted from the 'date' column.
df = pd.read_csv('shootings.csv').assign(
    count=1,
    year=lambda frame: pd.DatetimeIndex(frame['date']).year,
    month=lambda frame: pd.DatetimeIndex(frame['date']).month,
)
# BOXPLOT OF AGE, annotated with the overall median age
age = df['age']

plt.figure(figsize=(10, 10))
# A single box is enough; drawing both plt.boxplot and sns.boxplot put two
# boxes of the same data on one set of axes.
box = sns.boxplot(y=age)
# Annotate with the median of `age` itself. The `medians` variable left over
# from the race plot holds per-race medians and does not belong on this chart.
age_median = age.median()
box.annotate(str(age_median), xy=(0, age_median), horizontalalignment='center')
plt.xlabel("Plot")
plt.ylabel("Age")
The above boxplot of age shows that there are outliers. Next, we will look at the outlier information.
# Box-plot statistics of age: median, quartiles, and the whisker ends, i.e.
# the most extreme observations within 1.5 * IQR of the quartiles.
median = np.median(age)
lower_quartile, upper_quartile = np.percentile(age, [25, 75])
iqr = upper_quartile - lower_quartile

upper_fence = upper_quartile + 1.5 * iqr
lower_fence = lower_quartile - 1.5 * iqr
upper_whisker = age[age <= upper_fence].max()
lower_whisker = age[age >= lower_fence].min()
upper_whisker
72.0
People older than 72 are considered outliers in the box plot.
# Rows whose age is strictly greater than 72, and how many there are.
is_outlier = df['age'].gt(72)
outlier_information = df[is_outlier]
outlier_information['count'].count()
37
Since a significant number of people (37) have an age that the box plot considers an outlier, we will keep them in the analysis and will not delete them.
# DISTRIBUTION OF AGE: density histogram with a KDE curve
plt.figure(figsize=(15, 10))
sns.set_style("darkgrid")
# histplot is the axes-level replacement for the deprecated distplot;
# stat='density' keeps the y-axis on the density scale distplot used.
sns.histplot(df['age'], kde=True, stat='density', color='b')
/home/aman/Documents/my_env/lib/python3.8/site-packages/seaborn/distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
<AxesSubplot:xlabel='age', ylabel='Density'>
From the distribution plot, we can see that age is very slightly positively skewed, but overall it approximately follows a normal distribution.
# Probability (Q-Q) plot of age against the normal distribution, drawn on the
# current matplotlib figure.
plt.figure(figsize=(15, 10))
stats.probplot(df['age'], dist="norm", plot=plt)
((array([-3.63021417, -3.39456193, -3.26466538, ..., 3.26466538,
3.39456193, 3.63021417]),
array([ 6., 6., 12., ..., 84., 84., 91.])),
(12.449614166604643, 36.549749566213784, 0.980154295559862))
From the probability plot, the majority of the data follows the reference line, so we can say that age approximately follows a normal distribution.
The Shapiro-Wilk’s test or Shapiro test is a normality test in frequentist statistics. The null hypothesis of Shapiro’s test is that the population is distributed normally.
# Shapiro-Wilk normality test on age. Returns (statistic, p-value); the null
# hypothesis is that the sample comes from a normal distribution, so a small
# p-value is evidence AGAINST normality.
stats.shapiro(df['age'])
ShapiroResult(statistic=0.9606190323829651, pvalue=1.6336742685352274e-34)
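We applied the Shapiro-Wilk test to check whether age follows a normal distribution. The p-value is effectively 0 (1.6e-34 < 0.05), so we reject the null hypothesis: strictly speaking, age is not normally distributed. With a sample this large (4895 rows) the test flags even small deviations, such as the slight positive skew seen above, so age should be treated as only approximately normal.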
# HEATMAP OF SHOOTINGS PER MONTH (rows) AND YEAR (columns)
year_month_totalshootings = df.groupby(['year', 'month'])['count'].sum().reset_index()
year_month_totalshootings.columns = ['year', 'month', 'total_shootings']

# pivot() takes keyword-only arguments from pandas 2.0 on; the keyword form
# works on older versions as well.
heatmap_year_month_totalshootings = year_month_totalshootings.pivot(
    index='month', columns='year', values='total_shootings')

plt.figure(figsize=(15, 10))
# fmt='.0f' prints whole counts; the default '.2g' would show a three-digit
# count such as 104 as '1e+02'. A float format is used because missing
# year/month combinations turn the table into floats.
sns.heatmap(heatmap_year_month_totalshootings, annot=True, fmt='.0f')
<AxesSubplot:xlabel='year', ylabel='month'>
The three months in which the most people were shot are 01-2019, 01-2018 and 12-2019, in that order.
# AGE BRACKETS
# Labels "[0 - 10)", "[10 - 20)", ..., "[100 - 110)": eleven ten-year brackets.
age_range = ["[{0} - {1})".format(start, start + 10) for start in range(0, 101, 10)]
count_age_ranges = len(age_range)

# Explicit bin edges 0, 10, ..., 110 so each bracket really covers the ages
# its label names. Passing an integer as `bins` makes pd.cut split the
# observed min..max age range into equal-width bins instead, which do not
# line up with these labels. right=False makes each bin [low, high), matching
# the label notation.
age_bin_edges = list(range(0, 10 * count_age_ranges + 1, 10))
df['age_range'] = pd.cut(x=df['age'],
                         bins=age_bin_edges,
                         labels=age_range,
                         right=False)
# SHARE OF SHOOTINGS WITH SIGNS OF MENTAL ILLNESS PER AGE BRACKET
# observed=False keeps every age bracket in the result, including empty ones,
# regardless of the pandas default for categorical group keys.
agerange_totalshootings = (
    df.groupby(['age_range'], observed=False)['count'].sum().reset_index()
)
agerange_totalshootings.columns = ['age_range', 'total_shootings']
agerange_totalshootings

# Same table restricted to rows with signs of mental illness.
mental = df[df['signs_of_mental_illness'].eq(True)]
agerange_totalshootings_mental = (
    mental.groupby(['age_range'], observed=False)['count'].sum().reset_index()
)
agerange_totalshootings_mental.columns = ['age_range', 'total_shootings']
agerange_totalshootings_mental

# Join on the bracket label rather than on row position, so the counts stay
# attached to the right bracket even if the two tables differ in length.
agerange_totalshootings = agerange_totalshootings.merge(
    agerange_totalshootings_mental.rename(
        columns={'total_shootings': 'mentally disable'}),
    on='age_range',
    how='left',
)
# Brackets with no shootings give 0/0 = NaN here and are simply not drawn.
agerange_totalshootings['percentage of mentally disable'] = (
    agerange_totalshootings['mentally disable']
    / agerange_totalshootings['total_shootings']
) * 100
agerange_totalshootings['percentage of not mentally disable'] = (
    100 - agerange_totalshootings['percentage of mentally disable']
)
##
# Stacked bar per age bracket: the green segment is the percentage with signs
# of mental illness, and the yellow segment stacked on top is the remainder.
brackets = agerange_totalshootings['age_range']
ill_pct = agerange_totalshootings['percentage of mentally disable']
well_pct = agerange_totalshootings['percentage of not mentally disable']

plt.figure(figsize=(16, 7))
plt.bar(brackets, ill_pct, color='g')
plt.bar(brackets, well_pct, bottom=ill_pct, color='y')
plt.xlabel("Age bracket")
plt.ylabel("Percentage of mentally ill and well")
plt.show()
In the '70 to 80' and '100-110' age brackets, a larger share of people showed signs of mental illness than in the other age brackets. This suggests that the percentage of people with signs of mental illness increases with age.
# 5th percentile of age per state, then the five states where it is lowest.
state_age_p05 = df.groupby(['state'])['age'].quantile(q=0.05)
age_below_maturityage = state_age_p05.reset_index()
age_below_maturityage.columns = ['state', 'age']
age_below_maturityage.sort_values(by='age').iloc[:5]
| state | age | |
|---|---|---|
| 14 | IL | 17.0 |
| 2 | AR | 17.6 |
| 6 | CT | 17.9 |
| 22 | MI | 18.0 |
| 21 | ME | 18.0 |